Group X Data Analysis Project

Project Overview

Goal:

  • Identification of RNA-Dependent Proteins Build a robust analysis pipeline to identify proteins with altered distribution profiles after RNase treatment, using mass spectrometry data from 7,159 proteins isolated from synchronized mitotic HeLa cells.

  • Identification of RBPs specifically active during mitosis Compare RNA-dependent protein profiles from synchronized mitotic cells with those from unsynchronized cells to find RBPs that are exclusively active during mitosis.

  • Screening for RBP complexes active in Mitosis Investigate whether mitosis-specific RBPs co-segregate in control gradients aiming to detect both known and potentially novel RNA-dependent protein complexes (using different clustering methods).

  • Testing R-DeeP for further use in molecular weight predictions Test if a protein’s position in the sucrose gradient correlates with its molecular weight, using linear regression to evaluate predictive potential.

Libraries

# General
library(tidyverse) # Collection of tidy data tools (ggplot2, dplyr, tidyr, etc.) for wrangling, plotting, and transforming datasets
library(tidyr)# Provides functions for transforming and reshaping data structures — used for pivot_wider to convert long-format correlation data into a wide matrix for heatmap input
library(pheatmap)# Generates customizable heatmaps; used to visualize the Spearman correlation matrix between all replicate-fraction combinations
library(glue) # Easy string construction with embedded variables and functions
library(ggplot2) # Various plots (elegant, versatile and easy to use)
library(dplyr) # For conditional logic with case_when inside plot aesthetics, also for filtering and selecting relevant data 

# Clustering
library(dbscan) # Necessary for scaling in order to later run density based clustering

# Chunk Additional Data 
library(httr) # Making API requests to UniProt (GET requests and response handling)
library(jsonlite) # Parsing JSON responses from the UniProt API 
library(pbapply) # Applying functions with a progress bar
library(data.table) # Fast loading of large tables (CORUM) and simple grouping / restructuring of data using [i, j, by] syntax

#Loading MS-Data

MS_Table <- read.table("RDeeP_HeLa_Mitosis.csv", header = TRUE, row.names = 1, sep = ";")

Data Cleanup

Description of the Data - Lina

Goal:

  • Get a feeling for the dataset and decide on necessary clean-up steps
glue("Number of proteins analysed /rows : {nrow(MS_Table)}")
## Number of proteins analysed /rows : 7159
glue("Number of columns: {ncol(MS_Table)} (25 fractions with each 3 replicates for Ctrl and RNase treatment)")
## Number of columns: 150 (25 fractions with each 3 replicates for Ctrl and RNase treatment)
glue("Classification of variables: {unique(sapply(MS_Table, class))}")
## Classification of variables: numeric
glue("Any missing values: {any(is.na(MS_Table))}")
## Any missing values: FALSE
glue("Overall minimum intensity: {min(MS_Table)}")
## Overall minimum intensity: 0
glue("Overall maximum intensity: {max(MS_Table)}")
## Overall maximum intensity: 1514642849

#Normalization - Cihan, Sofia,Lina ### Goal: - Generate a new dataframe with replicate-averaged intensity values,scaled so that each protein’s total intensity sums to 100 across all fractions (for Ctrl and RNase) - Build a function to plot protein profiles using the normalized data
- Confirm experimental reproducibility by comparing intensity profiles across replicates

Normalization of triplicates - Lina

# Create a metadata dataframe with factor columns that map each MS_Table column to its experimental treatment, replicate, and fraction number
treatment <- factor(rep(c("Ctrl", "RNase"), each = 3, length.out = 150))
replicate <- factor(rep(c("Ctrl_Rep1", "Ctrl_Rep2", "Ctrl_Rep3", "RNase_Rep1","RNase_Rep2", "RNase_Rep3"),25))
fraction <- factor(rep(paste0("Fraction_", 1:25), each = 6))

data <- data.frame(rownames = colnames(MS_Table),treatment = treatment, replicate = replicate, fraction = fraction)

# Iterate over all fraction levels: for each fraction, select replicate columns of Ctrl and RNase, compute row-wise means and store them in previously created list
average.list <- list()

for (f in levels(fraction)) {
  
  cols.Ctrl <- which(fraction == f & treatment == "Ctrl") 
  average.Ctrl <- rowMeans(MS_Table[,cols.Ctrl]) 
  average.list[[paste0("Ctrl_",f)]] <- average.Ctrl 
  
  cols.RNase <- which(fraction == f & treatment == "RNase")
  average.RNase <- rowMeans(MS_Table[,cols.RNase])
  average.list[[paste0("RNase_",f)]] <- average.RNase
}

# Convert the list of averages to a data frame and restore rownames
MS_Table_Averages <- as.data.frame(average.list) 
rownames(MS_Table_Averages) <- rownames(MS_Table) 

# Reorder columns: alphabetic order of 'levels()' disrupts correct fraction sequence,so construct correct order manually (Ctrl_1 to Ctrl_25, then RNase_1 to RNase_25)
fractions <- paste0("Fraction_", 1:25)
ordered_names <- as.vector(rbind(
  paste0("Ctrl_", fractions),
  paste0("RNase_", fractions)
))

MS_Table_Averages <- MS_Table_Averages[, ordered_names]

Plot showing variance of our data - Sofia

library(dplyr)
# -------------- PART 1: Plot for standard deviation for all 7159 proteins from MS_Table_Averages for each fraction for the CTRL condition ----------



# 1. Select Ctrl columns using grep
ctrl_cols <- grep("^Ctrl_Fraction_", colnames(MS_Table_Averages), value = TRUE)

# 2. Randomly pick 25 proteins (rows)
set.seed(123)
selected <- sample(rownames(MS_Table_Averages), 25)
data <- MS_Table_Averages[selected, ctrl_cols]

# 3. Calculate mean and SE per protein (row-wise)
mean_vals <- apply(data, 1, mean, na.rm = TRUE)
se_vals <- apply(data, 1, function(x) sd(x, na.rm = TRUE) / sqrt(length(na.omit(x))))

# 4. Identify outliers for each protein
outliers <- lapply(1:nrow(data), function(i) {
  vals <- as.numeric(data[i, ])
  q1 <- quantile(vals, 0.25, na.rm = TRUE)
  q3 <- quantile(vals, 0.75, na.rm = TRUE)
  iqr <- q3 - q1
  out_idx <- which(vals < (q1 - 1.5 * iqr) | vals > (q3 + 1.5 * iqr))
  if (length(out_idx) > 0) {
    data.frame(Protein = rownames(data)[i],
               Intensity = vals[out_idx])
  }
})
outlier_df <- do.call(rbind, outliers)

# 5. Create summary dataframe
summary_df <- data.frame(
  Protein = rownames(data),
  Mean = mean_vals,
  SE = se_vals
)

# 6. Plot (no log scale)
ggplot(summary_df, aes(x = Protein, y = Mean)) +
  geom_bar(stat = "identity", fill = "gray70") +
  geom_errorbar(aes(ymin = Mean - SE, ymax = Mean + SE), width = 0.3) +
  geom_point(data = outlier_df, aes(x = Protein, y = Intensity),
             color = "darkred", size = 2, position = position_jitter(width = 0.2)) +
  labs(title = "Mean Intensity per Protein",
       y = "Mean Intensity", x = "") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Normalization to 100 - Cihan

Goal:

  • Create a new dataframe in which the values for each protein are scaled so that the distribution within the Ctrl and RNase conditions each sums to 100
  • For less variance between the fractions –> This allows comparison of distribution independent of absolute abundance
# Split up Control and RNAse values 
ctrl_cols <- grep("^Ctrl_", colnames(MS_Table_Averages))  
rnase_cols <- grep("^RNase_", colnames(MS_Table_Averages))

# Normalization of each group to 100 
ctrl_norm <- MS_Table_Averages[, ctrl_cols] / rowSums(MS_Table_Averages[, ctrl_cols]) * 100
rnase_norm <- MS_Table_Averages[, rnase_cols] / rowSums(MS_Table_Averages[, rnase_cols]) * 100

# Combine in one table 
MS_Table_Norm <- cbind(ctrl_norm, rnase_norm)
MS_Table_Norm <- MS_Table_Norm[, colnames(MS_Table_Averages)]
head(round(MS_Table_Norm, 2))
##             Ctrl_Fraction_1 RNase_Fraction_1 Ctrl_Fraction_2 RNase_Fraction_2
## 1433B_HUMAN            0.05             0.04            0.81             0.96
## 1433E_HUMAN            0.04             0.03            0.60             0.68
## 1433F_HUMAN            0.08             0.07            0.86             0.98
## 1433G_HUMAN            0.06             0.05            0.66             0.76
## 1433S_HUMAN            0.06             0.05            0.93             1.10
## 1433T_HUMAN            0.07             0.06            1.05             1.20
##             Ctrl_Fraction_3 RNase_Fraction_3 Ctrl_Fraction_4 RNase_Fraction_4
## 1433B_HUMAN            4.18             4.27           11.92            11.36
## 1433E_HUMAN            4.18             4.33           12.05            11.48
## 1433F_HUMAN            4.45             4.52           13.56            12.40
## 1433G_HUMAN            3.57             3.55           12.58            11.68
## 1433S_HUMAN            5.11             5.26           14.03            13.03
## 1433T_HUMAN            4.44             4.30           12.21            11.43
##             Ctrl_Fraction_5 RNase_Fraction_5 Ctrl_Fraction_6 RNase_Fraction_6
## 1433B_HUMAN           40.33            41.46           23.76            17.25
## 1433E_HUMAN           42.18            42.24           20.61            15.32
## 1433F_HUMAN           41.62            42.27           19.63            14.90
## 1433G_HUMAN           37.14            37.34           23.55            17.38
## 1433S_HUMAN           40.60            42.05           21.18            16.05
## 1433T_HUMAN           46.08            46.48           18.15            13.57
##             Ctrl_Fraction_7 RNase_Fraction_7 Ctrl_Fraction_8 RNase_Fraction_8
## 1433B_HUMAN            7.65             9.45            4.30             5.52
## 1433E_HUMAN            8.15             9.69            3.65             4.62
## 1433F_HUMAN            7.18             8.62            4.24             5.21
## 1433G_HUMAN            8.15            10.30            4.56             5.66
## 1433S_HUMAN            6.07             7.36            3.13             3.78
## 1433T_HUMAN            6.86             8.47            3.71             4.76
##             Ctrl_Fraction_9 RNase_Fraction_9 Ctrl_Fraction_10 RNase_Fraction_10
## 1433B_HUMAN            1.92             2.71             0.86              1.78
## 1433E_HUMAN            2.00             2.71             1.30              2.38
## 1433F_HUMAN            1.93             2.62             1.03              2.04
## 1433G_HUMAN            2.39             3.30             1.36              2.59
## 1433S_HUMAN            1.84             2.44             0.99              1.98
## 1433T_HUMAN            1.65             2.27             0.83              1.75
##             Ctrl_Fraction_11 RNase_Fraction_11 Ctrl_Fraction_12
## 1433B_HUMAN             0.46              0.77             0.43
## 1433E_HUMAN             0.63              0.99             0.42
## 1433F_HUMAN             0.59              0.95             0.57
## 1433G_HUMAN             0.67              1.08             0.65
## 1433S_HUMAN             0.55              0.89             0.53
## 1433T_HUMAN             0.41              0.65             0.39
##             RNase_Fraction_12 Ctrl_Fraction_13 RNase_Fraction_13
## 1433B_HUMAN              0.53             0.36              0.84
## 1433E_HUMAN              0.50             0.59              1.30
## 1433F_HUMAN              0.71             0.52              1.16
## 1433G_HUMAN              0.79             0.62              1.32
## 1433S_HUMAN              0.65             0.53              1.17
## 1433T_HUMAN              0.50             0.42              0.91
##             Ctrl_Fraction_14 RNase_Fraction_14 Ctrl_Fraction_15
## 1433B_HUMAN             0.33              0.61             0.41
## 1433E_HUMAN             0.41              0.73             0.53
## 1433F_HUMAN             0.41              0.70             0.42
## 1433G_HUMAN             0.53              0.89             0.58
## 1433S_HUMAN             0.46              0.78             0.55
## 1433T_HUMAN             0.36              0.61             0.51
##             RNase_Fraction_15 Ctrl_Fraction_16 RNase_Fraction_16
## 1433B_HUMAN              0.73             0.48              0.58
## 1433E_HUMAN              0.96             0.62              0.75
## 1433F_HUMAN              0.76             0.48              0.54
## 1433G_HUMAN              1.02             0.66              0.78
## 1433S_HUMAN              0.95             0.61              0.72
## 1433T_HUMAN              0.91             0.57              0.66
##             Ctrl_Fraction_17 RNase_Fraction_17 Ctrl_Fraction_18
## 1433B_HUMAN             0.58              0.51             0.41
## 1433E_HUMAN             0.56              0.50             0.56
## 1433F_HUMAN             0.71              0.63             0.52
## 1433G_HUMAN             0.69              0.62             0.65
## 1433S_HUMAN             0.65              0.61             0.58
## 1433T_HUMAN             0.74              0.63             0.51
##             RNase_Fraction_18 Ctrl_Fraction_19 RNase_Fraction_19
## 1433B_HUMAN              0.30             0.17              0.12
## 1433E_HUMAN              0.35             0.30              0.20
## 1433F_HUMAN              0.37             0.29              0.20
## 1433G_HUMAN              0.44             0.26              0.18
## 1433S_HUMAN              0.41             0.33              0.24
## 1433T_HUMAN              0.35             0.26              0.18
##             Ctrl_Fraction_20 RNase_Fraction_20 Ctrl_Fraction_21
## 1433B_HUMAN             0.19              0.08             0.16
## 1433E_HUMAN             0.26              0.10             0.17
## 1433F_HUMAN             0.27              0.11             0.26
## 1433G_HUMAN             0.29              0.12             0.16
## 1433S_HUMAN             0.39              0.14             0.29
## 1433T_HUMAN             0.20              0.08             0.20
##             RNase_Fraction_21 Ctrl_Fraction_22 RNase_Fraction_22
## 1433B_HUMAN              0.06             0.07              0.03
## 1433E_HUMAN              0.06             0.06              0.02
## 1433F_HUMAN              0.10             0.12              0.05
## 1433G_HUMAN              0.07             0.08              0.03
## 1433S_HUMAN              0.13             0.14              0.06
## 1433T_HUMAN              0.08             0.18              0.09
##             Ctrl_Fraction_23 RNase_Fraction_23 Ctrl_Fraction_24
## 1433B_HUMAN             0.05              0.02             0.09
## 1433E_HUMAN             0.04              0.02             0.07
## 1433F_HUMAN             0.08              0.03             0.15
## 1433G_HUMAN             0.05              0.02             0.09
## 1433S_HUMAN             0.09              0.04             0.28
## 1433T_HUMAN             0.06              0.02             0.11
##             RNase_Fraction_24 Ctrl_Fraction_25 RNase_Fraction_25
## 1433B_HUMAN              0.03             0.02              0.01
## 1433E_HUMAN              0.02             0.02              0.01
## 1433F_HUMAN              0.04             0.04              0.01
## 1433G_HUMAN              0.03             0.02              0.01
## 1433S_HUMAN              0.07             0.07              0.03
## 1433T_HUMAN              0.03             0.04              0.02
# Verify the calculation with one example; the row sum should be ~200 (Ctrl 100% + RNAse 100%)
rowSums(MS_Table_Norm["1433B_HUMAN", ]) 
## 1433B_HUMAN 
##         200

Plotting a Protein - Lina

Goal:

  • Build a function to plot normalized intensity values of Ctrl and RNase across all fractions
plot_protein <- function(x) {
  
  # Extract data, define all variables and range of x- and y-axis
  protein_of_interest <- x 
  protein_row <- MS_Table_Norm[rownames(MS_Table_Norm) == protein_of_interest, ]
  ctrl_values <- as.numeric(protein_row[seq(1, 49, by=2)])
  rnase_values <- as.numeric(protein_row[seq(2, 50, by=2)])
  fractions <- 1:25
  max_val <- max(ctrl_values, rnase_values, na.rm = TRUE)
  ylim_range <- c(0, max_val * 1.2)

  # Core plot function using crtl_values
  plot(fractions, ctrl_values, type="o", pch=20, lty=1, lwd = 1.5,  col="forestgreen", ylim= ylim_range,
       xlab="Fraction", ylab="Normalized Intensity", main= protein_of_interest, axes = FALSE)
       
  # Add line for rnase_values
  lines(fractions, rnase_values, type="o",pch =20, lty=1, lwd = 1.5,  col="firebrick3")
  
  # Color area under the curve
  polygon(c(fractions, rev(fractions)), 
          c(ctrl_values, rep(0, length(ctrl_values))), 
          col=adjustcolor("forestgreen", alpha.f=0.1), border=NA)
  
  polygon(c(fractions, rev(fractions)), 
          c(rnase_values, rep(0, length(rnase_values))), 
          col=adjustcolor("firebrick3", alpha.f=0.1), border=NA)
  
  # Add x- and y-axis with correct labels
  axis(1, at = 1:25, labels = 1:25, cex.axis = 0.7)
  axis(2, cex.axis = 0.7, las =2)
  grid(nx = NULL, ny = NULL, col = "lightgray", lty = "dotted", lwd = 0.8)

  # Add legend
  legend("topright", legend=c("Ctrl", "RNase"), col=c("forestgreen", "firebrick3"), 
         lty=1, lwd = 1.5,  pch=20, bg = "white", bty = "o", box.col = NA, horiz = TRUE)
}

# Example
plot_protein("RL27_HUMAN") 

Reproducibility Analysis - Cihan, Sofia

Goal:

  • Assess reproducibility of MS-based protein measurements for both treatments
  • Visualize how consistent protein ranking patterns are across all replicates and fractions
  • We expect a diagonal showing our r = 1, while the other combinations should show lower correlation
# Create empty dataframe for correlation 
full_correlation_results <- data.frame(
  Treatment = character(),
  Replicate_1 = character(),
  Fraction_1 = character(),
  Replicate_2 = character(),
  Fraction_2 = character(),
  Spearman_r = numeric(),
  stringsAsFactors = FALSE
)

# Iterate over both Ctrl and RNase treatments 
for (treatment in c("Ctrl", "RNase")) {
  
  # Create all possible rep-fraction combinations
  combo1 <- expand.grid(Rep = 1:3, Fraction = 1:25)
  combo2 <- expand.grid(Rep = 1:3, Fraction = 1:25)
  
  # Iterate for every possible combination
  for (i in 1:nrow(combo1)) {
    for (j in 1:nrow(combo2)) {
      
      rep1 <- combo1$Rep[i]
      frac1 <- combo1$Fraction[i]
      rep2 <- combo2$Rep[j]
      frac2 <- combo2$Fraction[j]
      
      # Find right column position 
      col1 <- which(replicate == paste0(treatment, "_Rep", rep1) & fraction == paste0("Fraction_", frac1))
      col2 <- which(replicate == paste0(treatment, "_Rep", rep2) & fraction == paste0("Fraction_", frac2))
      
      # Calculate Spearman correlation
      cor_value <- cor(MS_Table[, col1], MS_Table[, col2], method = "spearman", use = "complete.obs")
      
      # Save results in dataframe
      full_correlation_results <- rbind(full_correlation_results, data.frame(
        Treatment = treatment,
        Replicate_1 = paste0("Rep", rep1),
        Fraction_1 = paste0("Fraction_", frac1),
        Replicate_2 = paste0("Rep", rep2),
        Fraction_2 = paste0("Fraction_", frac2),
        Spearman_r = cor_value
      ))
    }
  }
}


head(full_correlation_results)
##   Treatment Replicate_1 Fraction_1 Replicate_2 Fraction_2 Spearman_r
## 1      Ctrl        Rep1 Fraction_1        Rep1 Fraction_1  1.0000000
## 2      Ctrl        Rep1 Fraction_1        Rep2 Fraction_1  0.9997441
## 3      Ctrl        Rep1 Fraction_1        Rep3 Fraction_1  0.9995977
## 4      Ctrl        Rep1 Fraction_1        Rep1 Fraction_2  0.7563490
## 5      Ctrl        Rep1 Fraction_1        Rep2 Fraction_2  0.7504747
## 6      Ctrl        Rep1 Fraction_1        Rep3 Fraction_2  0.7501923
# Create column for each axis 
full_correlation_results <- full_correlation_results %>%
   mutate(Row = paste0(Replicate_1, "_F", gsub("Fraction_", "", Fraction_1)),
         Col = paste0(Replicate_2, "_F", gsub("Fraction_", "", Fraction_2)))

# We select treatment RNase for our first heatmap
treatment_to_plot_RNase <- "RNase"

# Filter table for selected treatment: RNase
heatmap_data_RNase <- full_correlation_results %>%
  filter(Treatment == treatment_to_plot_RNase) %>%
  select(Row, Col, Spearman_r)

# Convert into matrix 
heatmap_matrix_RNase <- heatmap_data_RNase %>%
  pivot_wider(names_from = Col, values_from = Spearman_r, values_fill = NA) %>%
  column_to_rownames("Row") %>%
  as.matrix()

# Create heatmap for RNase treatment
print(pheatmap(heatmap_matrix_RNase,
         main = paste("Spearman Correlation Heatmap -", treatment_to_plot_RNase),
         cluster_rows = FALSE, 
         cluster_cols = FALSE, 
         show_rownames = TRUE, 
         show_colnames = TRUE,
         fontsize_row = 4,
         fontsize_col = 4))

# We select treatment Ctrl for our second heatmap
treatment_to_plot_Ctrl <- "Ctrl"

# Filter table for selected treatment: Ctrl
heatmap_data_Ctrl <- full_correlation_results %>%
  filter(Treatment == treatment_to_plot_Ctrl) %>%
  select(Row, Col, Spearman_r)

# Convert into matrix 
heatmap_matrix_Ctrl <- heatmap_data_Ctrl %>%
  pivot_wider(names_from = Col, values_from = Spearman_r, values_fill = NA) %>%
  column_to_rownames("Row") %>%
  as.matrix()

# Create heatmap for Ctrl treatment
print(pheatmap(heatmap_matrix_Ctrl,
         main = paste("Spearman Correlation Heatmap -", treatment_to_plot_Ctrl),
         cluster_rows = FALSE, 
         cluster_cols = FALSE, 
         show_rownames = TRUE, 
         show_colnames = TRUE,
         fontsize_row = 4,
         fontsize_col = 4))

Data Analysis

Goal:

  • Identification of RNA-Dependent Proteins
  • Identification of RBPs specifically active during mitosis

Peak Detection in normalized Ctrl and RNase profiles - Lina

Goal:

  • Identify peaks in normalized intensity profiles (Ctrl and RNase)
  • Extract number of peaks, their positions and heights

NOTE: Peak data is not used for RBP identification. This step is purely descriptive. However, Peak_Data is later used for clustering and linear regression analysis.

# Define x axis
x <- 1:25

# Build function for peak detection using slope between fractions
# Detects internal peaks above a given threshold (default: 3% of max intensity)
# Note: Cannot detect peaks at positions 1 and 25 (manually handled below)
find_peaks <- function(y, threshold = max(y) * 0.03) { 
  peaks <- which(diff(sign(diff(y))) == -2) + 1 # (+1 shifts index forward)
  peaks[y[peaks]>= threshold] 
}

# Initialize list to collect results per protein and iterate over all proteins (rows in MS_Table_Norm)
peak_results <- list()

for (p in seq_len(nrow(MS_Table_Norm))) {
  
  # Extract normalized values for Ctrl and RNase
  row_vals <- as.numeric(MS_Table_Norm [p,])
  ctrl_vals <- row_vals[seq(1, 50, by =2)]
  rnase_vals <- row_vals[seq(2, 50, by=2)]
  
  results_row <- list()
  
  # Process both conditions in one loop
  for (cond in c("Ctrl", "RNase")) {
    y <- if (cond == "Ctrl") ctrl_vals else rnase_vals
    threshold <- max(y, na.rm = TRUE) * 0.03
    
    peaks <- find_peaks(y)
    
    # Manually check for peaks at positions 1 and 25
    if (length(peaks) == 0) {
      peaks <- numeric(0)
      }
    if (!is.na(y[1]) && !is.na(y[2]) && y[1] > y[2] && y[1] >= threshold) {
      peaks <- c(1, peaks)
      }
    if (!is.na(y[25]) && !is.na(y[24]) && y[25] > y[24] && y[25] >= threshold) {
      peaks <- c(peaks, 25)
    }
    # Fallback: If no peaks are detected at all, use global maximum
    if(length(peaks) == 0) {
      peaks <- which.max(y)
      }
    
    # Limit number of stored peaks to max. 6
    n_peaks <- min(length(peaks), 6)
    
    # Extract peak height and peak position
    y_peaks <- y[peaks]
    x_peaks <- x[peaks]

    # Store data for current protein (Number of peaks, peak heights and peak positions)
    results_row[[paste0("n_peaks_", cond)]] <- n_peaks
    
    for(n in seq_len(n_peaks)) {
    results_row[[paste0("peak", n, "_height_", cond)]] <- y_peaks[n]
    results_row[[paste0("peak", n, "_position_", cond)]] <- x_peaks[n]
    }
  }
  # Store result for current protein
  peak_results[[p]] <- results_row
}
## Warning in max(y, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf
## Warning in max(y, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf
## Warning in max(y, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf
## Warning in max(y, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf
# Fill missing values with NAs to allow binding all rows together
rows_df <- lapply(peak_results, function(x){
  columns <- c("peak1_height", "peak1_position", 
               "peak2_height", "peak2_position", 
               "peak3_height", "peak3_position", 
               "peak4_height", "peak4_position", 
               "peak5_height", "peak5_position", 
               "peak6_height", "peak6_position")
  for (cond in c("Ctrl", "RNase")) {
    for (c in columns) {
      key <- paste0(c, "_", cond)
      if(!(key %in% names(x))) {
        x[[key]] <- NA
      }
    }
  }
  as.data.frame(x)
})

# Combine all protein results into one data frame 
Peak_Data <- do.call(rbind, rows_df) #do.call bind used because rows_df is a list of dataframes
rownames(Peak_Data) <- rownames(MS_Table_Norm)

# Reorder columns: first n_peaks, then all peak heights and positions (Ctrl & RNase)
order <- c("n_peaks_Ctrl", "n_peaks_RNase")

for (i in 1:6) {
  order <- c(order, 
                 paste0("peak", i, "_height_Ctrl"),
                 paste0("peak", i, "_position_Ctrl"),
                 paste0("peak", i, "_height_RNase"),
                 paste0("peak", i, "_position_RNase"))
}

Peak_Data <- Peak_Data[, order]

Shift Characteristics - Lina, Sofia, Cihan

Goal: - Quantify and summarize positional shifts in protein distribution between Ctrl and RNase conditions using center of mass (CoM) calculation as a weighted average - Further shift characterization with: Shift distance: difference between CoM_Ctrl and CoM_RNase Shift direction: +1 = left-shift, –1 = right-shift, 0 = no shift

NOTE: CoM-based shift calculations is later repeated per replicate and used for statistical testing (t-test) to identify significantly shifting RBPs.

# Initialize result vectors (one value per protein)
CoM_Ctrl <- numeric(nrow(MS_Table_Norm))
CoM_RNase <- numeric(nrow(MS_Table_Norm))
shift_distance <- numeric(nrow(MS_Table_Norm))
shift_direction <- numeric(nrow(MS_Table_Norm))

# Iterate over all proteins and calculate shift using center of mass
for (p in seq_len(nrow(MS_Table_Norm))) {
  
   # Extract normalized intensities for Ctrl and RNase
  row_vals <- as.numeric(MS_Table_Norm [p,])
  ctrl_vals <- row_vals[seq(1, 50, by =2)]
  rnase_vals <- row_vals[seq(2, 50, by=2)]
  
  # Calculate Center of Mass for both conditions
  CoM_Ctrl[p] <- sum(ctrl_vals * x) / sum(ctrl_vals)
  CoM_RNase[p] <- sum(rnase_vals * x) / sum(rnase_vals)
  
  # Compute shift distance and direction
  shift_distance[p] <- CoM_Ctrl[p] - CoM_RNase[p] 
  shift_direction[p] <- sign(shift_distance[p]) 
}

# Store results into a single dataframe
Shift_Data <- data.frame(
  CoM_Ctrl = CoM_Ctrl,
  CoM_RNase = CoM_RNase,
  shift_distance = shift_distance,
  shift_direction = shift_direction,
  row.names = rownames(MS_Table_Norm))

T-Test

1. Normalization of whole MS_Table to 100 - Cihan

Goal:

  • Normalize the values of each replicate within each treatment (Ctrl and RNase), so that the total across all 25 fractions equals 100 per replicate.
  • This ensures comparability between replicates by removing global intensity differences and prepares the data for center of mass (CoM) calculation and statistical testing (t-test) later on
# Split up all Ctrl and RNAse values and their individual Reps 
ctrl_cols_t1 <- grep("_Ctrl_Rep1", colnames(MS_Table))
ctrl_cols_t2 <- grep("_Ctrl_Rep2", colnames(MS_Table))
ctrl_cols_t3 <- grep("_Ctrl_Rep3", colnames(MS_Table))
RNase_cols_t1 <- grep("_RNase_Rep1", colnames(MS_Table))
RNase_cols_t2 <- grep("_RNase_Rep2", colnames(MS_Table))
RNase_cols_t3 <- grep("_RNase_Rep3", colnames(MS_Table))

# Normalization of each group to 100 
ctrl_norm_t1 <- MS_Table[, ctrl_cols_t1] / rowSums(MS_Table[, ctrl_cols_t1]) * 100
ctrl_norm_t2 <- MS_Table[, ctrl_cols_t2] / rowSums(MS_Table[, ctrl_cols_t2]) * 100
ctrl_norm_t3 <- MS_Table[, ctrl_cols_t3] / rowSums(MS_Table[, ctrl_cols_t3]) * 100
RNase_norm_t1 <- MS_Table[, RNase_cols_t1] / rowSums(MS_Table[, RNase_cols_t1]) * 100
RNase_norm_t2 <- MS_Table[, RNase_cols_t2] / rowSums(MS_Table[, RNase_cols_t2]) * 100
RNase_norm_t3 <- MS_Table[, RNase_cols_t3] / rowSums(MS_Table[, RNase_cols_t3]) * 100

# Combine in one table 
Norm_Data_for_t <- cbind(ctrl_norm_t1, RNase_norm_t1, ctrl_norm_t2, RNase_norm_t2, ctrl_norm_t3, RNase_norm_t3)

# In gewünschter Reihenfolge ordnen
fractions <- 1:25
reps <- 1:3
conditions <- c("Ctrl", "RNase") 


ordered_names <- c()

for (fraction in fractions) {
  for (rep in reps) {
    for (condition in conditions) {
      name <- paste0("Fraction", fraction, "_", condition, "_Rep", rep)
      ordered_names <- c(ordered_names, name)
    }
  }
}

Norm_Data_for_t <- Norm_Data_for_t[, ordered_names]

# Verify the calculation with one example; the row sum should be ~600 (3*Ctrl 100% + 3*RNAse 100%)
rowSums(Norm_Data_for_t["1433B_HUMAN", ]) 
## 1433B_HUMAN 
##         600

2. Calculation of Center of mass (CoM) - Sofia

Goal:

  • Calculating center of mass (CoM) for all replicates and treatments
# Setup
fractions <- 1:25
conditions <- c("Ctrl", "RNase")
replicates <- c("Rep1", "Rep2", "Rep3")

results_com <- list()

# Loop through conditions and replicates ( runs 6 times for all combinations)
for (cond in conditions) {
  for (rep in replicates) {
    
    # Single regex pattern to match all 25 fraction columns for this condition & replicate
    pattern <- paste0("^Fraction[0-9]+_", cond, "_", rep, "$") # defines a pattern 
    cols <- grep(pattern, colnames(Norm_Data_for_t), value = TRUE) # chooses pattern from colums (matching)

    # Subset and sort by fraction number
    subdata <- Norm_Data_for_t[, cols] # chooses all columns 
    subdata <- subdata[, order(as.numeric(gsub("Fraction(\\d+)_.*", "\\1", cols)))] # putting into the right order as Com assumes coloum 1 = fraction 1 

    # Center of mass calculation
    com <- apply(subdata, 1, function(x) sum(x * fractions) / sum(x)) # function (across rows)

    # Store result
    results_com[[paste(cond, rep, sep = "_")]] <- com # adds result from com to each cond/ rep, separates with _ 
  }
}

# Convert list to data frame
CoM_Data_for_t <- as.data.frame(results_com) 

rownames(CoM_Data_for_t) <- rownames(Norm_Data_for_t) # apply rownames from df to new df 

head(CoM_Data_for_t)
##             Ctrl_Rep1 Ctrl_Rep2 Ctrl_Rep3 RNase_Rep1 RNase_Rep2 RNase_Rep3
## 1433B_HUMAN  5.925425  5.823557  5.845832   5.940085   6.144829   5.911295
## 1433E_HUMAN  6.036753  5.935902  5.902976   6.057458   6.277645   6.003817
## 1433F_HUMAN  5.973098  5.934650  5.904769   5.990386   6.238765   5.950127
## 1433G_HUMAN  6.166125  6.105311  6.063888   6.264795   6.505422   6.151889
## 1433S_HUMAN  5.988280  5.998226  5.919392   5.969546   6.224258   5.918595
## 1433T_HUMAN  5.906721  5.813788  5.825574   5.871993   6.109628   5.907854

3. Calculating Shift-Values for T-Test - Lina

Goal:

  • Calculate deviation between CoM of Ctrl and CoM of RNase and store in df for t-test
# For each protein, subtract the RNase CoM from the corresponding Ctrl CoM across all three replicates to obtain replicate-wise shift distances.
# Columns 1–3 = Ctrl_Rep1–3, Columns 4–6 = RNase_Rep1–3 (in CoM_Data_for_t)
Shift_Data_for_t <- data.frame(
  shift_distance_rep1 = CoM_Data_for_t[, 1] - CoM_Data_for_t[, 4],
  shift_distance_rep2 = CoM_Data_for_t[, 2] - CoM_Data_for_t[, 5],
  shift_distance_rep3 = CoM_Data_for_t[, 3] - CoM_Data_for_t[, 6],
  row.names = rownames(CoM_Data_for_t)
)

# Count how many missing values (NA) are present across all shift distances. (proteins where CoM could not be calculated in at least one replicate)
sum(is.na(Shift_Data_for_t))
## [1] 13

4. T-Test - Cihan

Goal:

  • Statistically test whether proteins show a significant left-shift in their center of mass (CoM) distribution in RNase vs. Ctrl, indicating RNA-dependent localization.
  • This is done by:
    1. Applying the Shapiro-Wilk test to firstly ensure, that shift values of the replicates are normally distributed before running the t-test.
    2. Testing if the shift distance is significantly greater than 1 using a one-sided t-test.
  • Significant hits are categorized as RBPs.
# Create empty columns for statistical test results 
Shift_Data$shapiro_p <- NA
Shift_Data$normal_distributed <- NA
Shift_Data$p_value_ttest_filtered <- NA
Shift_Data$significant_ttest_filtered <- NA

# Iterate over all proteins 
for (protein in rownames(Shift_Data_for_t)) {
  
  # Extract shift distances for each replicate
  shift_values <- as.numeric(Shift_Data_for_t[protein, c("shift_distance_rep1", "shift_distance_rep2", "shift_distance_rep3")])
  
  # Make sure that no NAs are present
  if (all(!is.na(shift_values))) {
    
    # Check if all values are identical (no variance)
    if (length(unique(shift_values)) == 1) {
      
      # Set normal distribution to TRUE if no variance is present
      Shift_Data[protein, "shapiro_p"] <- NA
      Shift_Data[protein, "normal_distributed"] <- TRUE
      
    } else {
      
      # Perform Shapiro-Wilk test to check for normal distribution
      shapiro_result <- shapiro.test(shift_values)
      Shift_Data[protein, "shapiro_p"] <- shapiro_result$p.value
      Shift_Data[protein, "normal_distributed"] <- shapiro_result$p.value >= 0.05
    }
    
    # Perform one-sided t-test if data is normal distributed
    if (Shift_Data[protein, "normal_distributed"] == TRUE) {
      
      test_result <- t.test(shift_values, mu = 1, alternative = "greater")
      
      Shift_Data[protein, "p_value_ttest_filtered"] <- test_result$p.value
      Shift_Data[protein, "significant_ttest_filtered"] <- ifelse(test_result$p.value < 0.05, TRUE, FALSE)
      
    } else {
      # No t-test applied if data is not normal distributed
      Shift_Data[protein, "p_value_ttest_filtered"] <- NA
      Shift_Data[protein, "significant_ttest_filtered"] <- NA
    }
  }
}

# Summary of proteins regarding normal distribution
table(Shift_Data$normal_distributed, useNA = "ifany")
## 
## FALSE  TRUE  <NA> 
##   304  6848     7
# Summary of proteins regarding significance of t-test
table(Shift_Data$significant_ttest_filtered, useNA = "ifany")
## 
## FALSE  TRUE  <NA> 
##  6054   794   311
head(Shift_Data)
##             CoM_Ctrl CoM_RNase shift_distance shift_direction  shapiro_p
## 1433B_HUMAN 5.865905  5.989765    -0.12385993              -1 0.29639961
## 1433E_HUMAN 5.955975  6.103139    -0.14716379              -1 0.46248310
## 1433F_HUMAN 5.936312  6.044335    -0.10802358              -1 0.16974567
## 1433G_HUMAN 6.110018  6.292653    -0.18263513              -1 0.05750234
## 1433S_HUMAN 5.965632  6.022448    -0.05681575              -1 0.12563269
## 1433T_HUMAN 5.848752  5.952232    -0.10347998              -1 0.68093934
##             normal_distributed p_value_ttest_filtered
## 1433B_HUMAN               TRUE              0.9965351
## 1433E_HUMAN               TRUE              0.9965445
## 1433F_HUMAN               TRUE              0.9967239
## 1433G_HUMAN               TRUE              0.9963788
## 1433S_HUMAN               TRUE              0.9973063
## 1433T_HUMAN               TRUE              0.9962720
##             significant_ttest_filtered
## 1433B_HUMAN                      FALSE
## 1433E_HUMAN                      FALSE
## 1433F_HUMAN                      FALSE
## 1433G_HUMAN                      FALSE
## 1433S_HUMAN                      FALSE
## 1433T_HUMAN                      FALSE

Plot T-Test Results - Lina

###Goal: Plot t-test results by visualizing CoM positions in Ctrl vs. RNase

# Create scatter plot of CoM values in Ctrl (x) vs. RNase (y) for all proteins
# Color points based on t-test result (significant or not)
ggplot(Shift_Data, aes(x = `CoM_Ctrl`, y = `CoM_RNase`, color = case_when(
  `significant_ttest_filtered` ~ "Significant Left Shift (RBP)",
    TRUE ~ "No significant Left Shift"
  ))) +
  geom_point(alpha = 0.6) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "gray20") +
  scale_color_manual(name = "RBP Identity",values = c(
    "Significant Left Shift (RBP)" = "darkred",
    "No significant Left Shift" = "gray70"
  ))+
  labs(title = "Center of Mass in Ctrl and RNase: T-Test Results",
       x = "Center of Mass - Control",
       y = "Center of Mass - RNase") +
  theme_minimal()+
  theme(
    legend.position = c(0.02, 0.98),
    legend.justification = c("left", "top"),
    legend.background = element_rect(fill = "white",color = "gray70")
  )
## Warning: A numeric `legend.position` argument in `theme()` was deprecated in ggplot2
## 3.5.0.
## ℹ Please use the `legend.position.inside` argument of `theme()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Removed 3 rows containing missing values or values outside the scale range
## (`geom_point()`).

Validation of T-Test with UniProt Data - Lina

Goal:

Assess the validity of our T-test-based identification of RNA-binding proteins (RBPs), by comparing the results with annotated human RBPs from UniProt.

This includes: - Overlap analysis between our tested proteins and known UniProt RBPs - Calculation of a hit rate (percentage of known RBPs detected as T-test positives) - Validation with literature-supported positive and negative control proteins

# Load annotated RNA-binding proteins (Keyword KW-0694) from UniProt (human-specific)
uniprot_rbps <- read.delim("uniprotkb_KW_0694_AND_model_organism_96_2025_07_04.tsv", 
                           header = TRUE, sep = "\t", stringsAsFactors = FALSE)

# Extract UniProt entry names (used for matching)
known_rbps <- uniprot_rbps$Entry.Name
glue("Number of known RBPs in UniProt: {length(known_rbps)}")
## Number of known RBPs in UniProt: 3114
# Extract RBP candidates identified by T-test
t_test_positives <- rownames(
  Shift_Data[Shift_Data$significant_ttest_filtered == TRUE & 
               !is.na(Shift_Data$significant_ttest_filtered), ])

# Determine overlap: known RBPs present in our dataset
overlap_MS_Table <- intersect(rownames(MS_Table), known_rbps)
glue("Number of UniProt RBPs tested with R-DeeP: {length(overlap_MS_Table)}")
## Number of UniProt RBPs tested with R-DeeP: 543
# Determine overlap: known RBPs that were also T-test positive
overlap_t_test <- intersect(t_test_positives, known_rbps)
glue("Number identified as RBPs (T-Test positive): {length(overlap_t_test)}")
## Number identified as RBPs (T-Test positive): 230
glue("Hit rate: {round(length(overlap_t_test)/length(overlap_MS_Table)*100, 2)}%")
## Hit rate: 42.36%
# Determine number of identified RBS not listed as RBP in UniProt 
novel_candidates <- t_test_positives[!t_test_positives %in% known_rbps]
glue("Novel candidates identified as RBP but not listed in UniProt: {(length(novel_candidates))}")
## Novel candidates identified as RBP but not listed in UniProt: 564
# Positive Control Validation

  # RBM10, SMN1, and FMR1 are well-documented RBPs (literature-supported)
  pos_controls <- c("RBM10_HUMAN", "SMN_HUMAN", "FMR1_HUMAN")

  # Check whether they are in our dataset and classified as T-test positives
  pos_df <- data.frame(
    Protein_Positive_Control = pos_controls,
    InDataset = pos_controls %in% rownames(MS_Table),
    TTest = pos_controls %in% t_test_positives)
  print(pos_df)
##   Protein_Positive_Control InDataset TTest
## 1              RBM10_HUMAN      TRUE  TRUE
## 2                SMN_HUMAN      TRUE  TRUE
## 3               FMR1_HUMAN      TRUE  TRUE
# Positive Control Validation
  
  # ACTB, SDHB, and COX4I1 are typically not considered RNA-binding (negative controls)
  neg_controls <- c("ACTB_HUMAN", "SDHB_HUMAN", "COX41_HUMAN")
  
  # Same check as above for negatives
  neg_df <- data.frame(
    Protein_Negative_Control = neg_controls,
    InDataset = neg_controls %in% rownames(MS_Table),
    TTest = neg_controls %in% t_test_positives)
  print(neg_df)
##   Protein_Negative_Control InDataset TTest
## 1               ACTB_HUMAN      TRUE FALSE
## 2               SDHB_HUMAN      TRUE FALSE
## 3              COX41_HUMAN      TRUE FALSE

Evaluation of T-Test Limitations - Lina, Cihan, Sofia

Goal:

This section investigates potential false negatives from the T-test-based RBP identification.
- Explore cases where known RBPs were not classified as significant and examine possible causes. - Manual inspection of different samples of 40 proteins: - Not identified RBPs to assess false negative rate. - T-Test positives to assess false positive rate. - Borderline cases (0.05 > p-value > 0.045)

# Identify known UniProt RBPs not found by T-Test
not_identified <- known_rbps[! known_rbps %in% t_test_positives & known_rbps %in% rownames(MS_Table)]

# Total "missed" RBPs present in dataset
glue("RNA-binding/-interacting proteins (UniProt) analyzed but not detected in our data: {length(not_identified)}")
## RNA-binding/-interacting proteins (UniProt) analyzed but not detected in our data: 313
# Analyze potential causes
shapiro_failed <- Shift_Data[is.na(Shift_Data$shapiro_p),]
percentage_shapiro_failed <- sum(is.na(Shift_Data[not_identified, "shapiro_p"])) /length(not_identified) *100
glue("Number of Proteins, that were excluded, because shapiro results where NA: {nrow(shapiro_failed)}")
## Number of Proteins, that were excluded, because shapiro results where NA: 1218
glue("Approx. Percentage of not identified RBPs that where lost in Shapiro-Wilk test: {round(percentage_shapiro_failed, 1)} %")
## Approx. Percentage of not identified RBPs that where lost in Shapiro-Wilk test: 11.5 %
normality_failed <- Shift_Data[Shift_Data$normal_distributed == FALSE,]
percentage_normality_failed <- sum(Shift_Data[not_identified, "normal_distributed"] == FALSE) / length(not_identified) *100
glue("Number of Proteins, that where excluded, because replicates wheren't normally distributed: {nrow(normality_failed)}")
## Number of Proteins, that where excluded, because replicates wheren't normally distributed: 311
glue("Approx. Percentage of not identified RBPs that where lost in normality test: {round(percentage_normality_failed, 1)} %")
## Approx. Percentage of not identified RBPs that where lost in normality test: 9.9 %
#Visual inspection of a sample of non-identified proteins
sample_not_identified <- head( not_identified, 40)
for (s in sample_not_identified ){
  plot_protein(s)
}

glue("Manual inspection of 40 non-identified RBPs suggests a false negative rate of approximately 13% (5 out of 40) within the testable subset (visual shift despite test results)")
## Manual inspection of 40 non-identified RBPs suggests a false negative rate of approximately 13% (5 out of 40) within the testable subset (visual shift despite test results)
#Visual inspection of a sample of T-Test positive proteins
sample_t_test_positives <-head(t_test_positives,40)
for (s in sample_t_test_positives){
  plot_protein(s)
}

glue("From a sample of 40 T-test-positive proteins, only 2 would have been visually rejected as RBPs, corresponding to a 5% false positive rate.")
## From a sample of 40 T-test-positive proteins, only 2 would have been visually rejected as RBPs, corresponding to a 5% false positive rate.
# Visual inspection of proteins with marginally significant p-values (above 0.045)
t_test_positives_critical_p_value <- rownames(
  Shift_Data[
    Shift_Data$significant_ttest_filtered == TRUE &
    !is.na(Shift_Data$significant_ttest_filtered) &
    Shift_Data$p_value_ttest_filtered > 0.045,
  ]
)
for (s in t_test_positives_critical_p_value){
  plot_protein(s)
}

glue("Furthermore, of all proteins with p-values > 0.045 (n = 18), only 4 were judged inconsistent with an RBP-like shift profile, suggesting that most borderline cases still show convincing shift patterns.")
## Furthermore, of all proteins with p-values > 0.045 (n = 18), only 4 were judged inconsistent with an RBP-like shift profile, suggesting that most borderline cases still show convincing shift patterns.

###Plot Validation of T-test with UniProt Data - Lina ### Goal: - This plot extends the CoM-based T-test visualization by highlighting known RBPs from UniProt. - This view helps assess the sensitivity and limitations of the shift-based identification method relative to external RBP annotations.

# Create data subset of proteins for highlighting (was necessary for coloring sequence)
highlight_data <- subset(Shift_Data,
                         rownames(Shift_Data) %in% overlap_t_test |
                         rownames(Shift_Data) %in% not_identified |
                         rownames(Shift_Data) %in% novel_candidates)

# Classify each protein into a color group
highlight_data$color_group <- case_when(
  rownames(highlight_data) %in% overlap_t_test ~ "Overlapping with UniProt",
  rownames(highlight_data) %in% not_identified ~ "UniProt Proteins not identified",
  rownames(highlight_data) %in% novel_candidates ~ "Novel Candidates"
)

# Control transparency depending on group
highlight_data$alpha_group <- case_when(
  highlight_data$color_group == "Overlapping with UniProt" ~ "high",
  highlight_data$color_group == "UniProt Proteins not identified" ~ "high",
  highlight_data$color_group == "Novel Candidates" ~ "low"
)

# Base plot: all proteins shown in background
ggplot(Shift_Data, aes(x = `CoM_Ctrl`, y = `CoM_RNase`)) +
  
    # Background: all non-highlighted proteins in light gray
    geom_point(aes(color = "No significant Shift"), alpha = 0.3, show.legend = FALSE) +
  
    # Highlighted groups: UniProt hits, misses, and T-test positives
    geom_point(data = highlight_data,
             aes(color = color_group, alpha = alpha_group)) +
    scale_color_manual(name = "Significance as RBP",values = c(
    "No significant Shift" = "gray80",
    "Novel Candidates" = "lightcoral",
    "Overlapping with UniProt" = "darkblue",
    "UniProt Proteins not identified" = "#009ACD"
    ))+
    scale_alpha_manual(values = c("high" = 0.7, "low" = 0.3), guide = "none") +
  
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "gray20") +
  labs(title = "Validation of T-Test: Comparing with UniProt Data",
       x = "Center of Mass - Control",
       y = "Center of Mass - RNase") +
  theme_minimal()+
  theme(
    legend.position = c(0.02, 0.98),
    legend.justification = c("left", "top"),
    legend.background = element_rect(fill = "white",color = "gray70")
    )
## Warning: Removed 3 rows containing missing values or values outside the scale range
## (`geom_point()`).

T-Test for non-synchronised HELA Cells - Lina

  • This section reproduces all necessary pipeline steps with adjusted variable names.
    NOTE: All variables related to the non-synchronized dataset are suffixed with _NS.
# Loading Data
MS_Table_NS <- read.table("RDeeP_HeLa_NS.csv", header = TRUE, row.names = 1, sep = ";")

# Dataframe
treatment <- factor(rep(c("Ctrl", "RNase"), each = 3, length.out = 150))
replicate <- factor(rep(c("Ctrl_Rep1", "Ctrl_Rep2", "Ctrl_Rep3", "RNase_Rep1","RNase_Rep2", "RNase_Rep3"),25))
fraction <- factor(rep(paste0("Fraction_", 1:25), each = 6))

data <- data.frame(rownames = colnames(MS_Table),treatment = treatment, replicate = replicate, fraction = fraction)

# Normalization of triplicates
average.list.NS <- list() 

for (f in levels(fraction)) {
  
  cols.Ctrl <- which(fraction == f & treatment == "Ctrl") 
  average.Ctrl <- rowMeans(MS_Table_NS[,cols.Ctrl]) 
  average.list.NS[[paste0("Ctrl_",f)]] <- average.Ctrl 
  
  cols.RNase <- which(fraction == f & treatment == "RNase")
  average.RNase <- rowMeans(MS_Table_NS[,cols.RNase])
  average.list.NS[[paste0("RNase_",f)]] <- average.RNase
}

MS_Table_Averages_NS <- as.data.frame(average.list.NS) 
rownames(MS_Table_Averages_NS) <- rownames(MS_Table_NS)

fractions <- paste0("Fraction_", 1:25)

ordered_names <- as.vector(rbind(
  paste0("Ctrl_", fractions),
  paste0("RNase_", fractions)
))

MS_Table_Averages_NS <- MS_Table_Averages_NS[, ordered_names]

# Normalization to 100

ctrl_cols <- grep("^Ctrl_", colnames(MS_Table_Averages_NS)) 
rnase_cols <- grep("^RNase_", colnames(MS_Table_Averages_NS))

ctrl_norm <- MS_Table_Averages_NS[, ctrl_cols] / rowSums(MS_Table_Averages_NS[, ctrl_cols]) * 100
rnase_norm <- MS_Table_Averages_NS[, rnase_cols] / rowSums(MS_Table_Averages_NS[, rnase_cols]) * 100

MS_Table_Norm_NS <- cbind(ctrl_norm, rnase_norm)
MS_Table_Norm_NS <- MS_Table_Norm_NS[, colnames(MS_Table_Averages_NS)]

# Function for protein plotting 
plot_protein_NS <- function(x) {
  protein_of_interest <- x 
  protein_row <- MS_Table_Norm_NS[rownames(MS_Table_Norm_NS) == protein_of_interest, ]
  ctrl_values <- as.numeric(protein_row[seq(1, 49, by=2)])
  rnase_values <- as.numeric(protein_row[seq(2, 50, by=2)])
  fractions <- 1:25
  max_val <- max(ctrl_values, rnase_values, na.rm = TRUE)
  ylim_range <- c(0, max_val * 1.1)

  plot(fractions, ctrl_values, type="o", pch=20, lty=1, lwd =1.5, col="forestgreen", ylim=range(c(ctrl_values, rnase_values)),
       xlab="Fraction", ylab="Normalized Intensity", main= paste(protein_of_interest, "non-synchronised"), axes = FALSE)
       
  lines(fractions, rnase_values, type="o",pch =20, lty=1,lwd =1.5, col="firebrick3")
  
  polygon(c(fractions, rev(fractions)), 
          c(ctrl_values, rep(0, length(ctrl_values))), 
          col=adjustcolor("forestgreen", alpha.f=0.1), border=NA)
  
  polygon(c(fractions, rev(fractions)), 
          c(rnase_values, rep(0, length(rnase_values))), 
          col=adjustcolor("firebrick3", alpha.f=0.1), border=NA)
  
  axis(1, at = 1:25, labels = 1:25, cex.axis = 0.7)
  axis(2, cex.axis = 0.7, las =2)
  grid(nx = NULL, ny = NULL, col = "lightgray", lty = "dotted", lwd = 0.8)
  
  legend("topright", legend=c("Ctrl", "RNase"), col=c("forestgreen", "firebrick3"), 
         lty=1, lwd = 1.5,  pch=20, bg = "white", bty = "o", box.col = NA)
}

# Shift Characteristics based on normalized Data and CoM
x <- 1:25

CoM_Ctrl_NS <- numeric(nrow(MS_Table_Norm_NS))
CoM_RNase_NS <- numeric(nrow(MS_Table_Norm_NS))
shift_distance_NS <- numeric(nrow(MS_Table_Norm_NS))
shift_direction_NS <- numeric(nrow(MS_Table_Norm_NS))

for (p in seq_len(nrow(MS_Table_Norm_NS))) {

  row_vals <- as.numeric(MS_Table_Norm_NS [p,])
  ctrl_vals <- row_vals[seq(1, 50, by =2)]
  rnase_vals <- row_vals[seq(2, 50, by=2)]

  CoM_Ctrl_NS[p] <- sum(ctrl_vals * x) / sum(ctrl_vals)
  CoM_RNase_NS[p] <- sum(rnase_vals * x) / sum(rnase_vals)
  shift_distance_NS[p] <- CoM_Ctrl_NS[p] - CoM_RNase_NS[p] 
  shift_direction_NS[p] <- sign(shift_distance_NS[p]) 
}

Shift_Data_NS <- data.frame(
  CoM_Ctrl= CoM_Ctrl_NS,
  CoM_RNase = CoM_RNase_NS,
  shift_distance = shift_distance_NS,
  shift_direction= shift_direction_NS,
  row.names = rownames(MS_Table_Norm_NS))

# Normalization of MS_Table_NS to 100

ctrl_cols_t1 <- grep("_Ctrl_Rep1", colnames(MS_Table_NS))
ctrl_cols_t2 <- grep("_Ctrl_Rep2", colnames(MS_Table_NS))
ctrl_cols_t3 <- grep("_Ctrl_Rep3", colnames(MS_Table_NS))
RNase_cols_t1 <- grep("_RNase_Rep1", colnames(MS_Table_NS))
RNase_cols_t2 <- grep("_RNase_Rep2", colnames(MS_Table_NS))
RNase_cols_t3 <- grep("_RNase_Rep3", colnames(MS_Table_NS))

ctrl_norm_t1 <- MS_Table_NS[, ctrl_cols_t1] / rowSums(MS_Table_NS[, ctrl_cols_t1]) * 100
ctrl_norm_t2 <- MS_Table_NS[, ctrl_cols_t2] / rowSums(MS_Table_NS[, ctrl_cols_t2]) * 100
ctrl_norm_t3 <- MS_Table_NS[, ctrl_cols_t3] / rowSums(MS_Table_NS[, ctrl_cols_t3]) * 100
RNase_norm_t1 <- MS_Table_NS[, RNase_cols_t1] / rowSums(MS_Table_NS[, RNase_cols_t1]) * 100
RNase_norm_t2 <- MS_Table_NS[, RNase_cols_t2] / rowSums(MS_Table_NS[, RNase_cols_t2]) * 100
RNase_norm_t3 <- MS_Table_NS[, RNase_cols_t3] / rowSums(MS_Table_NS[, RNase_cols_t3]) * 100

Norm_Data_for_t_NS <- cbind(ctrl_norm_t1, RNase_norm_t1, ctrl_norm_t2, RNase_norm_t2, ctrl_norm_t3, RNase_norm_t3)

fractions <- 1:25
reps <- 1:3
conditions <- c("Ctrl", "RNase") 
ordered_names <- c()
for (fraction in fractions) {
  for (rep in reps) {
    for (condition in conditions) {
      name <- paste0("Fraction", fraction, "_", condition, "_Rep", rep)
      ordered_names <- c(ordered_names, name)
    }
  }
}
Norm_Data_for_t_NS <- Norm_Data_for_t_NS[, ordered_names]

# Calculation of CoM for T-Test

fractions <- 1:25
conditions <- c("Ctrl", "RNase")
replicates <- c("Rep1", "Rep2", "Rep3")

results_com_NS <- list()

for (cond in conditions) {
  for (rep in replicates) {
    pattern <- paste0("^Fraction", fractions, "_", cond, "_", rep, "$")
    cols <- grep(paste(pattern, collapse="|"), colnames(Norm_Data_for_t_NS), value = TRUE)  
    subdata <- Norm_Data_for_t_NS[, cols]   
    subdata <- subdata[, order(as.numeric(gsub("Fraction(\\d+)_.*", "\\1", cols)))]  
    com <- apply(subdata, 1, function(x) {
      sum(x * fractions) / sum(x)
    })

    results_com_NS[[paste(cond, rep, sep = "_")]] <- com
  }
}

CoM_Data_for_t_NS <- as.data.frame(results_com_NS)  

rownames(CoM_Data_for_t_NS) <- rownames(Norm_Data_for_t_NS)

# Calculating Shift-Values for T-Test

Shift_Data_for_t_NS <- data.frame(
  shift_distance_rep1 = CoM_Data_for_t_NS[, 1] - CoM_Data_for_t_NS[, 4],
  shift_distance_rep2 = CoM_Data_for_t_NS[, 2] - CoM_Data_for_t_NS[, 5],
  shift_distance_rep3 = CoM_Data_for_t_NS[, 3] - CoM_Data_for_t_NS[, 6],
  row.names = rownames(CoM_Data_for_t_NS)
)

# T-Test

Shift_Data_NS$shapiro_p <- NA
Shift_Data_NS$normal_distributed <- NA
Shift_Data_NS$p_value_ttest_filtered <- NA
Shift_Data_NS$significant_ttest_filtered <- NA

for (protein in rownames(Shift_Data_for_t_NS)) {
 
  shift_values <- as.numeric(Shift_Data_for_t_NS[protein, c("shift_distance_rep1", "shift_distance_rep2", "shift_distance_rep3")])

  if (all(!is.na(shift_values))) {

    if (length(unique(shift_values)) == 1) {
      Shift_Data_NS[protein, "shapiro_p"] <- NA
      Shift_Data_NS[protein, "normal_distributed"] <- TRUE
      
    } else {
      shapiro_result <- shapiro.test(shift_values)
      Shift_Data_NS[protein, "shapiro_p"] <- shapiro_result$p.value
      Shift_Data_NS[protein, "normal_distributed"] <- shapiro_result$p.value >= 0.05
    }
    
    if (Shift_Data_NS[protein, "normal_distributed"] == TRUE) {
      test_result <- t.test(shift_values, mu = 1, alternative = "greater")
      Shift_Data_NS[protein, "p_value_ttest_filtered"] <- test_result$p.value
      Shift_Data_NS[protein, "significant_ttest_filtered"] <- ifelse(test_result$p.value < 0.05, TRUE, FALSE)
      
    } else {
      Shift_Data_NS[protein, "p_value_ttest_filtered"] <- NA
      Shift_Data_NS[protein, "significant_ttest_filtered"] <- NA
    }
  }
}
# Analysis of results for non-synchronized HeLa Cells
table(Shift_Data_NS$significant_ttest_filtered, useNA = "ifany")
## 
## FALSE  TRUE  <NA> 
##  3717   724   324

Comparison of Proteins found as RBP in Mitosis vs. in NS_Hela - Lina

Goal:

  • Compare proteins identified as RNA-binding by T-test in mitotic vs. non-synchronized HeLa cells. Analyze the overlap between datasets, list proteins uniquely shifted during mitosis, and visualize differences in shift characteristics.
# Identify proteins classified as significant RBPs in non-synchronized HeLa cells
t_test_positives_NS <- rownames(Shift_Data_NS[Shift_Data_NS$significant_ttest_filtered == TRUE & !is.na(Shift_Data_NS$significant_ttest_filtered), ])

# Determine protein overlapp across datasets
overlap_analysis <- intersect(rownames(MS_Table_NS), rownames(MS_Table))
glue("Number of Proteins in sample NS_Hela: {length(rownames(MS_Table_NS))}")
## Number of Proteins in sample NS_Hela: 4765
glue("Number of Proteins analysed in both samples: {length(overlap_analysis)}")
## Number of Proteins analysed in both samples: 4101
# Identify proteins found as RBPs in both conditions
overlap_t_test_positivs <- intersect(t_test_positives, t_test_positives_NS)
glue("Number of RBPs found in both samples: {length(overlap_t_test_positivs)}")
## Number of RBPs found in both samples: 376
# Identify RBPs found *only* in mitosis, and also present in both datasets
RBPs_Mitosis <- setdiff(t_test_positives, t_test_positives_NS)
RBPs_Mitosis <- intersect(RBPs_Mitosis, overlap_analysis)
glue("Number RBPs found only in Mitosis: {length(RBPs_Mitosis)}")
## Number RBPs found only in Mitosis: 237
# Number of RBPs found *only* in non-synchonized cells, and also present in both datasets
RBPs_Hela <- setdiff(t_test_positives_NS, t_test_positives)
RBPs_Hela <- intersect(RBPs_Hela, overlap_analysis)
glue("Number RBPs found only in non-synchonized cells: {length(RBPs_Hela)}")
## Number RBPs found only in non-synchonized cells: 298
# Display a sample of plots (mitosis and non-synchronised) of RBPs uniquely active in mitosis
sample_RBPs_Mitosis <- head(RBPs_Mitosis,10)
for (r in sample_RBPs_Mitosis){
  plot_protein(r)
  plot_protein_NS(r)
}

# Visualise differences in shift characteristics

# Define variables to extract from both datasets
cols_to_extract <- c("CoM_Ctrl", "CoM_RNase", "shift_distance", "shift_direction", "p_value_ttest_filtered", "significant_ttest_filtered")

# Subset and rename columns for both conditions
subset_data_S <- Shift_Data[overlap_analysis, cols_to_extract]
subset_data_NS <- Shift_Data_NS[overlap_analysis, cols_to_extract]
colnames(subset_data_S) <- paste0(colnames(subset_data_S), "_mitosis")
colnames(subset_data_NS) <- paste0(colnames(subset_data_NS), "_nonSync")

# Merge into one combined dataframe
Shift_Data_Combined <- cbind(subset_data_S,subset_data_NS)
head(Shift_Data_Combined)
##             CoM_Ctrl_mitosis CoM_RNase_mitosis shift_distance_mitosis
## AHNK_HUMAN          6.838550          6.804008             0.03454188
## PRKDC_HUMAN        14.629323         13.796488             0.83283494
## ACTB_HUMAN          5.876815          5.979347            -0.10253220
## ACTG_HUMAN          5.876815          5.979347            -0.10253220
## ACTC_HUMAN          5.836062          5.925910            -0.08984742
## ACTS_HUMAN          5.836062          5.925910            -0.08984742
##             shift_direction_mitosis p_value_ttest_filtered_mitosis
## AHNK_HUMAN                        1                      0.9975260
## PRKDC_HUMAN                       1                      0.8134208
## ACTB_HUMAN                       -1                      0.9934067
## ACTG_HUMAN                       -1                      0.9934067
## ACTC_HUMAN                       -1                      0.9935800
## ACTS_HUMAN                       -1                      0.9935800
##             significant_ttest_filtered_mitosis CoM_Ctrl_nonSync
## AHNK_HUMAN                               FALSE         10.09021
## PRKDC_HUMAN                              FALSE         14.83020
## ACTB_HUMAN                               FALSE         12.09840
## ACTG_HUMAN                               FALSE         12.09840
## ACTC_HUMAN                               FALSE         12.17772
## ACTS_HUMAN                               FALSE         12.17772
##             CoM_RNase_nonSync shift_distance_nonSync shift_direction_nonSync
## AHNK_HUMAN           9.193942             0.89627235                       1
## PRKDC_HUMAN         14.358327             0.47187324                       1
## ACTB_HUMAN          12.126841            -0.02843975                      -1
## ACTG_HUMAN          12.126841            -0.02843975                      -1
## ACTC_HUMAN          12.189021            -0.01129869                      -1
## ACTS_HUMAN          12.189021            -0.01129869                      -1
##             p_value_ttest_filtered_nonSync significant_ttest_filtered_nonSync
## AHNK_HUMAN                      0.00703533                               TRUE
## PRKDC_HUMAN                     0.03095199                               TRUE
## ACTB_HUMAN                      0.81047299                              FALSE
## ACTG_HUMAN                      0.81047299                              FALSE
## ACTC_HUMAN                      0.80948945                              FALSE
## ACTS_HUMAN                      0.80948945                              FALSE
# Scatterplot of shift distances

ggplot(Shift_Data_Combined, aes(x = `shift_distance_mitosis`,
                                y = `shift_distance_nonSync`,
                                color = case_when(
                                  !`significant_ttest_filtered_mitosis` & !`significant_ttest_filtered_nonSync` ~ "No significant RBPs",
                                  `significant_ttest_filtered_mitosis` & !`significant_ttest_filtered_nonSync` ~ "RBPs only active in Mitosis",
                                  !`significant_ttest_filtered_mitosis` & `significant_ttest_filtered_nonSync` ~ "RBPs not active in Mitosis",
                                  TRUE ~ "RBPs always active"
                                  ))) +
  geom_point(alpha = 0.6) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "red") +
  scale_color_manual(name = "Significance as RBP",
                     values = c(
                       "No significant RBPs" = "grey80",
                       "RBPs not active in Mitosis" = "gray30",
                       "RBPs always active" = "lightcoral",
                       "RBPs only active in Mitosis" = "darkred"
                       ))+
  labs(title = "Shift Distance: Mitosis vs. Non-Synchronized",
       x = "Shift Distance (Mitosis)",
       y = "Shift Distance (Non-Synchronized)") +
  theme_minimal()

Further Data Analysis

Screening for RBP complexes active in Mitosis - Sofia and Cihan

Selection of clustering criteria and hierarchical clustering

Goal:

Select necessary parameters/dimensions and get an idea of the distribution of clusters.

Additional Information - Peak_data is mainly taken into consideration, specifically peak height (not position or number of peaks) we only focus on Ctrl samples, not RNase as we expect different proteins to have different distributions after RNase degradation, even when they are in the same complex. Following this logic shift data is also irrelevant. To this data we add COM of Ctrl, here peak position is relevant. Overall similar height is more important than similar peak fraction. Runs only on set of RBPS exculsively active in mitosis . Wards method (euclidean distance applied), !i think here we have to say it did not work.

#from the two shift and peak characteristics we extract some variables creating a new merged table which also has only the names of the RBPs that are aktive in mitosis !! hier stimmt was nicht.

# 1) Filter only the RBPs relevant in Mitosis 

filtered_shift_Mitosis <- Shift_Data[rownames(Shift_Data) %in% RBPs_Mitosis, ]

filtered_peak_Mitosis <- Peak_Data[rownames(Peak_Data) %in% RBPs_Mitosis, ]

# 2) Selecting only the interesting coloumns creating comon dataframe

merged_filterd_mitosis <- cbind(
  filtered_shift_Mitosis[, c("CoM_Ctrl")], # selecting a specific col
  filtered_peak_Mitosis %>% select(ends_with("height_Ctrl")) # choosing only col that end with
)

head(merged_filterd_mitosis)
##             filtered_shift_Mitosis[, c("CoM_Ctrl")] peak1_height_Ctrl
## ACL6A_HUMAN                                12.13922          9.070790
## AFF1_HUMAN                                 14.56960         21.084274
## AIMP1_HUMAN                                16.49765         11.395723
## AK17A_HUMAN                                16.05941         25.297419
## AKAP8_HUMAN                                16.65152          5.437610
## AKP8L_HUMAN                                16.08390          9.931061
##             peak2_height_Ctrl peak3_height_Ctrl peak4_height_Ctrl
## ACL6A_HUMAN         16.479083          4.160535          5.201113
## AFF1_HUMAN          66.793791                NA                NA
## AIMP1_HUMAN          6.279209          7.588047                NA
## AK17A_HUMAN          4.326068         11.392133         12.037883
## AKAP8_HUMAN          4.592040          8.754841          7.886461
## AKP8L_HUMAN         10.206197          7.455709          3.095471
##             peak5_height_Ctrl peak6_height_Ctrl
## ACL6A_HUMAN                NA                NA
## AFF1_HUMAN                 NA                NA
## AIMP1_HUMAN                NA                NA
## AK17A_HUMAN         11.186555          10.56167
## AKAP8_HUMAN          7.547957                NA
## AKP8L_HUMAN                NA                NA
# 3) clean data for further steps 

# Replace NA values with 0 (when there was no peak there was not protein)
clean_merged_filterd_mitosis <- merged_filterd_mitosis
clean_merged_filterd_mitosis[is.na(clean_merged_filterd_mitosis)] <- 0


# 4) do hirarchical clustering in order to analize number of clusters that could be found


# Distance matrix (Euclidean)
dist_matrix <- dist(clean_merged_filterd_mitosis)

# Hierarchical clustering (Wards)
hc_ward <- hclust(dist_matrix, method = "ward.D2")

# Plot dendrogram
par(mfrow = c(2, 2))
#plot(hc_ward, main = "Ward's Method")  was sollte das 


plot(hc_ward, labels = FALSE, main = "Ward's Method (no labels)")

# image for download: 

png("C://Users//Sofi//Desktop//4 Semester//Bioinfo_project//Downloadsdendrogram_large.png", width = 1200, height = 800)
plot(hc_ward, labels = FALSE, main = "Ward's Method (Large Output)")
dev.off()
## Warning in dev.off(): not a supported scheme, no image data written
## quartz_off_screen 
##                 2

Using DBSCAN algorithm

Goal:

Find and applying a clustering method, that will highlight proteins, which could be working in a complex.

Additional Information: DBSCAN is a function that can classify points in low-density regions as “noise.” This means clusters are not only defined by distance (like in k-means), but also by density. The code I wrote has two important parameters. These parameters are:

ε (epsilon): The maximum distance between two points to be considered neighbors. If this number is very small, everything is classified as noise; if it’s too large, noise may be included as relevant (resulting in one huge cluster).

MinPts: The minimum number of neighbors required for a point to be considered a so-called core point. A core point must have at least MinPts other points within ε. If this value is too low, even single points can form clusters. On the other hand, if it’s too high, only large clusters are formed and smaller ones might not be recognized at all. There are also border points, which are within ε distance of a core point but do not have enough neighbors themselves to be core points.

Everything that does not meet these conditions is filtered out and labeled as noise.

# 1) Scale data

scaled_data <- scale(clean_merged_filterd_mitosis) # substracts mean, scales it by dividing sd=> data was normalized per row => here Goal is compare Proteins 

# 2) DBSCAN # its important to choose eps and minPts correctly, eps is the max distance for two points to be neighbors, minPts  minimum number of points required to form a dense region.

db <- dbscan(scaled_data, eps = 0.7, minPts = 4) # parameters can be changed

# 3) PCA for plotting

pca <- prcomp(scaled_data) # function for PCA
pca_df <- as.data.frame(pca$x[, 1:2])  # PC1 and PC2, for all Proteins 
pca_df$cluster <- as.factor(db$cluster)
pca_df$protein <- rownames(clean_merged_filterd_mitosis)  # Protein names from cleaned data 

# 4) Plot with noise in grey

ggplot(pca_df, aes(x = PC1, y = PC2, color = cluster, label = ifelse(cluster == 0, "", protein))) +
  geom_point(size = 2) +
  geom_text(check_overlap = TRUE, size = 2.5, vjust = -1) +
  scale_color_manual(values = c("0" = "grey", "1" = "blue", "2" = "red", "3" = "green", "4" = "purple", "5" = "orange")) +
  theme_minimal() +
  labs(title = "DBSCAN Clustering of Proteins",
       subtitle = "Visualized with PCA (2D)",
       color = "Cluster") +
  theme(plot.title = element_text(face = "bold"))

# 5) Create table: which proteins are in which cluster

clustered_proteins <- split(rownames(clean_merged_filterd_mitosis), db$cluster) # asignes the cluster to each Protein name from original dataframe 

# Print each cluster's members
for (cl in sort(unique(db$cluster))) { # loops over each cluster number in sorted order
  
  cat(paste0("Cluster ", cl, ifelse(cl == 0, " (noise)", ""), ": ")) # pastes to console , separation of cluster name from list of Proteins 
  
  cat(paste(clustered_proteins[[as.character(cl)]], collapse = ", ")) # extracts list of proteins and collapses them 
  
  cat("\n\n") # adds spacing between clusters 
}
## Cluster 0 (noise): ACL6A_HUMAN, AFF1_HUMAN, AK17A_HUMAN, ANO8_HUMAN, ASCC2_HUMAN, ASCC3_HUMAN, ATLA2_HUMAN, ATRX_HUMAN, BCL7A_HUMAN, BCL7B_HUMAN, BLM_HUMAN, BOP1_HUMAN, BRD3_HUMAN, BRD7_HUMAN, BRX1_HUMAN, BTF3_HUMAN, CCNK_HUMAN, CCNT1_HUMAN, CLAP1_HUMAN, CPSF3_HUMAN, CSTFT_HUMAN, DDRGK_HUMAN, DDX20_HUMAN, DDX3Y_HUMAN, DDX50_HUMAN, DHX30_HUMAN, DHX37_HUMAN, DHX8_HUMAN, DHX9_HUMAN, DICER_HUMAN, DMAP1_HUMAN, DNJA3_HUMAN, DNJC2_HUMAN, DSRAD_HUMAN, DYL1_HUMAN, ECT2_HUMAN, EFHD1_HUMAN, EI2BD_HUMAN, EI2BG_HUMAN, EP400_HUMAN, EPIPL_HUMAN, ERAL1_HUMAN, ERCC6_HUMAN, EZH2_HUMAN, F133B_HUMAN, FUBP3_HUMAN, FXR1_HUMAN, FXR2_HUMAN, GELS_HUMAN, GEMI4_HUMAN, GLYR1_HUMAN, GPTC4_HUMAN, GRWD1_HUMAN, H2B2C_HUMAN, H2B2D_HUMAN, H3C_HUMAN, HEXI1_HUMAN, HJURP_HUMAN, HNRH1_HUMAN, HNRLL_HUMAN, HNRPC_HUMAN, HNRPD_HUMAN, IF2G_HUMAN, INTU_HUMAN, KC1A_HUMAN, KCNH1_HUMAN, KHDR3_HUMAN, KI67_HUMAN, KRI1_HUMAN, LAR1B_HUMAN, LARP1_HUMAN, LSG1_HUMAN, MBOA5_HUMAN, MDC1_HUMAN, MET16_HUMAN, MMTA2_HUMAN, MO4L2_HUMAN, MOV10_HUMAN, MRCKA_HUMAN, MSI1H_HUMAN, MYO10_HUMAN, NAT10_HUMAN, NMT1_HUMAN, NO40_HUMAN, NOC2L_HUMAN, NPA1P_HUMAN, NUMA1_HUMAN, NUMBL_HUMAN, NUMB_HUMAN, NUSAP_HUMAN, OLA1_HUMAN, PAPD1_HUMAN, PCF11_HUMAN, PESC_HUMAN, PHF5A_HUMAN, PKCB1_HUMAN, PLEC_HUMAN, POP7_HUMAN, PPIG_HUMAN, PRC2A_HUMAN, PRP4_HUMAN, PSPC1_HUMAN, PUM1_HUMAN, PUM2_HUMAN, PURA_HUMAN, PWP2_HUMAN, PYRD_HUMAN, RALY_HUMAN, RBM27_HUMAN, RBM47_HUMAN, RBM5_HUMAN, REN3B_HUMAN, RFA2_HUMAN, RFIP1_HUMAN, RFOX1_HUMAN, RFOX2_HUMAN, RGAP1_HUMAN, RHG05_HUMAN, RHOF_HUMAN, RL22L_HUMAN, RL22_HUMAN, RL3L_HUMAN, RLA1_HUMAN, RPB1_HUMAN, RPB2_HUMAN, RPC1_HUMAN, RS12_HUMAN, RS27_HUMAN, RUXF_HUMAN, S39A7_HUMAN, SAFB1_HUMAN, SGPL1_HUMAN, SMU1_HUMAN, SPAS2_HUMAN, SRP68_HUMAN, SRS10_HUMAN, SRS12_HUMAN, SUGP2_HUMAN, SUZ12_HUMAN, SYF1_HUMAN, TB10B_HUMAN, TCF25_HUMAN, TERF2_HUMAN, TF2B_HUMAN, TF3C5_HUMAN, TFB2M_HUMAN, TFP11_HUMAN, THOC5_HUMAN, TIM_HUMAN, TM7S3_HUMAN, TMX4_HUMAN, TNPO3_HUMAN, TOE1_HUMAN, TPX2_HUMAN, TRA2B_HUMAN, TRI26_HUMAN, TRI33_HUMAN, TRIPB_HUMAN, TRIPC_HUMAN, TRM2A_HUMAN, TRM6_HUMAN, UIMC1_HUMAN, WDR33_HUMAN, WDR43_HUMAN, WIZ_HUMAN, YTDC2_HUMAN, ZC3H1_HUMAN, ZCH18_HUMAN, ZN281_HUMAN, ZN318_HUMAN, ZN598_HUMAN
## 
## Cluster 1: AIMP1_HUMAN, AKP8L_HUMAN, ATP6_HUMAN, S61A1_HUMAN, S61A2_HUMAN
## 
## Cluster 2: AP3M2_HUMAN, CNDG2_HUMAN, DDX6_HUMAN, DUS2L_HUMAN, DYST_HUMAN, FGL2_HUMAN, GEMI5_HUMAN, HEXA_HUMAN, HNRPF_HUMAN, IF4G1_HUMAN, LPPRC_HUMAN, NSUN2_HUMAN, UTRO_HUMAN
## 
## Cluster 3: APTX_HUMAN, NKRF_HUMAN, OAS3_HUMAN, PTCD1_HUMAN, RL30_HUMAN, RL34_HUMAN, TF3C1_HUMAN, TRM1L_HUMAN, TUT7_HUMAN, ZBT11_HUMAN
## 
## Cluster 4: CBX8_HUMAN, FMR1_HUMAN, KIFC1_HUMAN, RBM6_HUMAN, RL23_HUMAN, RL27_HUMAN, RS26L_HUMAN, RS26_HUMAN, RS2_HUMAN, RS6_HUMAN, TRI25_HUMAN, UBF1_HUMAN, ZCCHV_HUMAN
## 
## Cluster 5: ELYS_HUMAN, KIF14_HUMAN, NOP58_HUMAN, PRPF3_HUMAN, RBBP6_HUMAN, RFC1_HUMAN, ZC11A_HUMAN
## 
## Cluster 6: H31T_HUMAN, H31_HUMAN, H32_HUMAN, H33_HUMAN
## 
## Cluster 7: CKAP5_HUMAN, ESF1_HUMAN, KIN17_HUMAN, TF3C4_HUMAN, TOP2B_HUMAN
## 
## Cluster 8: AKAP8_HUMAN, DDX3X_HUMAN, NCBP2_HUMAN, RS27L_HUMAN, SRSF1_HUMAN
## 
## Cluster 9: RL24_HUMAN, SMCA5_HUMAN, TBL2_HUMAN, TMEM9_HUMAN

Looking at plots of different clusters

###Goal: Looking if the plots look similar or not in order to validate the parameters choosen above.

Note! This code can be always run to get a better feeling over clusters

# Code from ploting a protein to visualize the proteins that seam to make a complex
# View Examples

sample_RBPs_Cluster <- c("RL22_HUMAN", "RL23_HUMAN", "RL24_HUMAN", "RL27_HUMAN", "RL30_HUMAN", "RL34_HUMAN", "RLA1_HUMAN", "RS26_HUMAN", "RS2_HUMAN", "RS6_HUMAN", "RS12_HUMAN") # insert names of clustered proteins you want to compare 

for (r in sample_RBPs_Cluster){
  plot_protein(r)
}

Finding the right parameters to choose the perfect cluster size

###Goal: Getting a good idea of how parameters affect our clustering results, by trying all combinations and creating a heat map based on the performance on a cluster.

Additional Information: For the heatmap we made up following logic.

  1. Based on all combinations from one cluster (positive control):
  • If two proteins are in the same cluster → +1
  • If two proteins are in different clusters → -1
  • If one or both are classified as noise → 0
  1. Based on two (or more) unrelated proteins (negative control):
  • If they end up in the same cluster as our example proteins → -1
  • If they end up in different clusters → +1
  • If noise → 0 (I’m least sure about this case)
# -------------- PART 1: Creating a loop for all possible combinations for ε between the numbers ( 0.5-1.5) in steps of 0.1 and for MinPts (1-10) in steps of 1 ----------

# 1. Scale data like chunk before 
scaled_data <- scale(clean_merged_filterd_mitosis)
protein_names <- rownames(clean_merged_filterd_mitosis)
n_proteins <- nrow(scaled_data)

# 2. Create empty data frame to collect results
results_df <- data.frame(row.names = protein_names)

# 3. Loop over all combinations
for (eps in seq(0.5, 1.5, by = 0.1)) {
  for (minPts in 1:10) {
    
    # Run DBSCAN
    db <- dbscan(scaled_data, eps = eps, minPts = minPts) # runs combinations 
    
    # Store cluster assignments
    col_name <- paste0("eps_", eps, "_minPts_", minPts)
    results_df[[col_name]] <- db$cluster
  }
}

# 4. Preview result
head(results_df)
##             eps_0.5_minPts_1 eps_0.5_minPts_2 eps_0.5_minPts_3 eps_0.5_minPts_4
## ACL6A_HUMAN                1                0                0                0
## AFF1_HUMAN                 2                0                0                0
## AIMP1_HUMAN                3                1               10                0
## AK17A_HUMAN                4                0                0                0
## AKAP8_HUMAN                5                0                0                0
## AKP8L_HUMAN                3                1               10                0
##             eps_0.5_minPts_5 eps_0.5_minPts_6 eps_0.5_minPts_7 eps_0.5_minPts_8
## ACL6A_HUMAN                0                0                0                0
## AFF1_HUMAN                 0                0                0                0
## AIMP1_HUMAN                0                0                0                0
## AK17A_HUMAN                0                0                0                0
## AKAP8_HUMAN                0                0                0                0
## AKP8L_HUMAN                0                0                0                0
##             eps_0.5_minPts_9 eps_0.5_minPts_10 eps_0.6_minPts_1
## ACL6A_HUMAN                0                 0                1
## AFF1_HUMAN                 0                 0                2
## AIMP1_HUMAN                0                 0                3
## AK17A_HUMAN                0                 0                4
## AKAP8_HUMAN                0                 0                5
## AKP8L_HUMAN                0                 0                3
##             eps_0.6_minPts_2 eps_0.6_minPts_3 eps_0.6_minPts_4 eps_0.6_minPts_5
## ACL6A_HUMAN                0                0                0                0
## AFF1_HUMAN                 0                0                0                0
## AIMP1_HUMAN                1                1                1                0
## AK17A_HUMAN                0                0                0                0
## AKAP8_HUMAN                0                0                0                0
## AKP8L_HUMAN                1                1                1                0
##             eps_0.6_minPts_6 eps_0.6_minPts_7 eps_0.6_minPts_8 eps_0.6_minPts_9
## ACL6A_HUMAN                0                0                0                0
## AFF1_HUMAN                 0                0                0                0
## AIMP1_HUMAN                0                0                0                0
## AK17A_HUMAN                0                0                0                0
## AKAP8_HUMAN                0                0                0                0
## AKP8L_HUMAN                0                0                0                0
##             eps_0.6_minPts_10 eps_0.7_minPts_1 eps_0.7_minPts_2
## ACL6A_HUMAN                 0                1                0
## AFF1_HUMAN                  0                2                0
## AIMP1_HUMAN                 0                3                1
## AK17A_HUMAN                 0                4                0
## AKAP8_HUMAN                 0                5                2
## AKP8L_HUMAN                 0                3                1
##             eps_0.7_minPts_3 eps_0.7_minPts_4 eps_0.7_minPts_5 eps_0.7_minPts_6
## ACL6A_HUMAN                0                0                0                0
## AFF1_HUMAN                 0                0                0                0
## AIMP1_HUMAN                1                1                0                0
## AK17A_HUMAN                0                0                0                0
## AKAP8_HUMAN               14                8                0                0
## AKP8L_HUMAN                1                1                0                0
##             eps_0.7_minPts_7 eps_0.7_minPts_8 eps_0.7_minPts_9
## ACL6A_HUMAN                0                0                0
## AFF1_HUMAN                 0                0                0
## AIMP1_HUMAN                0                0                0
## AK17A_HUMAN                0                0                0
## AKAP8_HUMAN                0                0                0
## AKP8L_HUMAN                0                0                0
##             eps_0.7_minPts_10 eps_0.8_minPts_1 eps_0.8_minPts_2
## ACL6A_HUMAN                 0                1                0
## AFF1_HUMAN                  0                2                0
## AIMP1_HUMAN                 0                3                1
## AK17A_HUMAN                 0                4                0
## AKAP8_HUMAN                 0                5                2
## AKP8L_HUMAN                 0                3                1
##             eps_0.8_minPts_3 eps_0.8_minPts_4 eps_0.8_minPts_5 eps_0.8_minPts_6
## ACL6A_HUMAN                0                0                0                0
## AFF1_HUMAN                 0                0                0                0
## AIMP1_HUMAN                1                1                1                1
## AK17A_HUMAN                0                0                0                0
## AKAP8_HUMAN                2               12                8                0
## AKP8L_HUMAN                1                1                1                1
##             eps_0.8_minPts_7 eps_0.8_minPts_8 eps_0.8_minPts_9
## ACL6A_HUMAN                0                0                0
## AFF1_HUMAN                 0                0                0
## AIMP1_HUMAN                1                1                0
## AK17A_HUMAN                0                0                0
## AKAP8_HUMAN                0                0                0
## AKP8L_HUMAN                1                1                0
##             eps_0.8_minPts_10 eps_0.9_minPts_1 eps_0.9_minPts_2
## ACL6A_HUMAN                 0                1                1
## AFF1_HUMAN                  0                2                0
## AIMP1_HUMAN                 0                1                1
## AK17A_HUMAN                 0                3                0
## AKAP8_HUMAN                 0                4                2
## AKP8L_HUMAN                 0                1                1
##             eps_0.9_minPts_3 eps_0.9_minPts_4 eps_0.9_minPts_5 eps_0.9_minPts_6
## ACL6A_HUMAN                1                1                0                0
## AFF1_HUMAN                 0                0                0                0
## AIMP1_HUMAN                1                1                1                1
## AK17A_HUMAN                0                0                0                0
## AKAP8_HUMAN                2                2                2                0
## AKP8L_HUMAN                1                1                1                1
##             eps_0.9_minPts_7 eps_0.9_minPts_8 eps_0.9_minPts_9
## ACL6A_HUMAN                0                0                0
## AFF1_HUMAN                 0                0                0
## AIMP1_HUMAN                1                1                1
## AK17A_HUMAN                0                0                0
## AKAP8_HUMAN                0                0                0
## AKP8L_HUMAN                1                1                1
##             eps_0.9_minPts_10 eps_1_minPts_1 eps_1_minPts_2 eps_1_minPts_3
## ACL6A_HUMAN                 0              1              1              1
## AFF1_HUMAN                  0              2              0              0
## AIMP1_HUMAN                 2              1              1              1
## AK17A_HUMAN                 0              3              0              0
## AKAP8_HUMAN                 0              1              1              1
## AKP8L_HUMAN                 2              1              1              1
##             eps_1_minPts_4 eps_1_minPts_5 eps_1_minPts_6 eps_1_minPts_7
## ACL6A_HUMAN              1              1              1              1
## AFF1_HUMAN               0              0              0              0
## AIMP1_HUMAN              1              1              1              1
## AK17A_HUMAN              0              0              0              0
## AKAP8_HUMAN              1              1              4              4
## AKP8L_HUMAN              1              1              1              1
##             eps_1_minPts_8 eps_1_minPts_9 eps_1_minPts_10 eps_1.1_minPts_1
## ACL6A_HUMAN              1              1               0                1
## AFF1_HUMAN               0              0               0                2
## AIMP1_HUMAN              1              1               1                1
## AK17A_HUMAN              0              0               0                3
## AKAP8_HUMAN              0              0               0                1
## AKP8L_HUMAN              1              1               1                1
##             eps_1.1_minPts_2 eps_1.1_minPts_3 eps_1.1_minPts_4 eps_1.1_minPts_5
## ACL6A_HUMAN                1                1                1                1
## AFF1_HUMAN                 0                0                0                0
## AIMP1_HUMAN                1                1                1                1
## AK17A_HUMAN                0                0                0                0
## AKAP8_HUMAN                1                1                1                1
## AKP8L_HUMAN                1                1                1                1
##             eps_1.1_minPts_6 eps_1.1_minPts_7 eps_1.1_minPts_8 eps_1.1_minPts_9
## ACL6A_HUMAN                1                1                1                1
## AFF1_HUMAN                 0                0                0                0
## AIMP1_HUMAN                1                1                1                1
## AK17A_HUMAN                0                0                0                0
## AKAP8_HUMAN                1                1                1                3
## AKP8L_HUMAN                1                1                1                1
##             eps_1.1_minPts_10 eps_1.2_minPts_1 eps_1.2_minPts_2
## ACL6A_HUMAN                 1                1                1
## AFF1_HUMAN                  0                2                0
## AIMP1_HUMAN                 1                1                1
## AK17A_HUMAN                 0                3                0
## AKAP8_HUMAN                 4                1                1
## AKP8L_HUMAN                 1                1                1
##             eps_1.2_minPts_3 eps_1.2_minPts_4 eps_1.2_minPts_5 eps_1.2_minPts_6
## ACL6A_HUMAN                1                1                1                1
## AFF1_HUMAN                 0                0                0                0
## AIMP1_HUMAN                1                1                1                1
## AK17A_HUMAN                0                0                0                0
## AKAP8_HUMAN                1                1                1                1
## AKP8L_HUMAN                1                1                1                1
##             eps_1.2_minPts_7 eps_1.2_minPts_8 eps_1.2_minPts_9
## ACL6A_HUMAN                1                1                1
## AFF1_HUMAN                 0                0                0
## AIMP1_HUMAN                1                1                1
## AK17A_HUMAN                0                0                0
## AKAP8_HUMAN                1                1                1
## AKP8L_HUMAN                1                1                1
##             eps_1.2_minPts_10 eps_1.3_minPts_1 eps_1.3_minPts_2
## ACL6A_HUMAN                 1                1                1
## AFF1_HUMAN                  0                2                0
## AIMP1_HUMAN                 1                1                1
## AK17A_HUMAN                 0                3                0
## AKAP8_HUMAN                 1                1                1
## AKP8L_HUMAN                 1                1                1
##             eps_1.3_minPts_3 eps_1.3_minPts_4 eps_1.3_minPts_5 eps_1.3_minPts_6
## ACL6A_HUMAN                1                1                1                1
## AFF1_HUMAN                 0                0                0                0
## AIMP1_HUMAN                1                1                1                1
## AK17A_HUMAN                0                0                0                0
## AKAP8_HUMAN                1                1                1                1
## AKP8L_HUMAN                1                1                1                1
##             eps_1.3_minPts_7 eps_1.3_minPts_8 eps_1.3_minPts_9
## ACL6A_HUMAN                1                1                1
## AFF1_HUMAN                 0                0                0
## AIMP1_HUMAN                1                1                1
## AK17A_HUMAN                0                0                0
## AKAP8_HUMAN                1                1                1
## AKP8L_HUMAN                1                1                1
##             eps_1.3_minPts_10 eps_1.4_minPts_1 eps_1.4_minPts_2
## ACL6A_HUMAN                 1                1                1
## AFF1_HUMAN                  0                2                0
## AIMP1_HUMAN                 1                1                1
## AK17A_HUMAN                 0                3                0
## AKAP8_HUMAN                 1                1                1
## AKP8L_HUMAN                 1                1                1
##             eps_1.4_minPts_3 eps_1.4_minPts_4 eps_1.4_minPts_5 eps_1.4_minPts_6
## ACL6A_HUMAN                1                1                1                1
## AFF1_HUMAN                 0                0                0                0
## AIMP1_HUMAN                1                1                1                1
## AK17A_HUMAN                0                0                0                0
## AKAP8_HUMAN                1                1                1                1
## AKP8L_HUMAN                1                1                1                1
##             eps_1.4_minPts_7 eps_1.4_minPts_8 eps_1.4_minPts_9
## ACL6A_HUMAN                1                1                1
## AFF1_HUMAN                 0                0                0
## AIMP1_HUMAN                1                1                1
## AK17A_HUMAN                0                0                0
## AKAP8_HUMAN                1                1                1
## AKP8L_HUMAN                1                1                1
##             eps_1.4_minPts_10 eps_1.5_minPts_1 eps_1.5_minPts_2
## ACL6A_HUMAN                 1                1                1
## AFF1_HUMAN                  0                2                0
## AIMP1_HUMAN                 1                1                1
## AK17A_HUMAN                 0                3                0
## AKAP8_HUMAN                 1                1                1
## AKP8L_HUMAN                 1                1                1
##             eps_1.5_minPts_3 eps_1.5_minPts_4 eps_1.5_minPts_5 eps_1.5_minPts_6
## ACL6A_HUMAN                1                1                1                1
## AFF1_HUMAN                 0                0                0                0
## AIMP1_HUMAN                1                1                1                1
## AK17A_HUMAN                0                0                0                0
## AKAP8_HUMAN                1                1                1                1
## AKP8L_HUMAN                1                1                1                1
##             eps_1.5_minPts_7 eps_1.5_minPts_8 eps_1.5_minPts_9
## ACL6A_HUMAN                1                1                1
## AFF1_HUMAN                 0                0                0
## AIMP1_HUMAN                1                1                1
## AK17A_HUMAN                0                0                0
## AKAP8_HUMAN                1                1                1
## AKP8L_HUMAN                1                1                1
##             eps_1.5_minPts_10
## ACL6A_HUMAN                 1
## AFF1_HUMAN                  0
## AIMP1_HUMAN                 1
## AK17A_HUMAN                 0
## AKAP8_HUMAN                 1
## AKP8L_HUMAN                 1
# -------------- PART 2: Creating a loop for all possible combinations for ε between the numbers ( 0.5-1.5) in steps of 0.1 and for MinPts (1-10) in steps of 1 ----------


# 1)Define your proteins of interest ( from Corum Data)

target_proteins <- c("RS12_HUMAN", "RS26_HUMAN", "RS2_HUMAN", "RS6_HUMAN") # this would be positiv controll, a complex we know about. 

negative_controls <- c("LPPRC_HUMAN", "UIMC1_HUMAN")

all_proteins <- c(target_proteins, negative_controls)

# 2) create all pairwise combinations (postiv control)

protein_pairs <- combn(all_proteins, 2, simplify = FALSE) 

# --- Step 4: scoring for each DBSCAN setting ---

score_results <- data.frame()

for (col_name in colnames(results_df)) { # selcts all coloums from DBSCAN combinations 
  
  cluster_assignments <- as.numeric(results_df[[col_name]]) # extracts data and converts to numeric
  
  names(cluster_assignments) <- rownames(results_df) # assignes its cluster number back to its protein name 
  
  pair_scores <- c()

  for (pair in protein_pairs) {
    p1 <- pair[1] # selects first protein 
    p2 <- pair[2] # selects second protein 
    c1 <- cluster_assignments[p1] 
    c2 <- cluster_assignments[p2]

    # Skip if NA
    if (is.na(c1) || is.na(c2)) {
      score <- NA

    # Both are targets
    } else if (p1 %in% target_proteins && p2 %in% target_proteins) {
      if (c1 == 0 || c2 == 0) {
        score <- 0 # noise is always 0
      } else if (c1 == c2) {
        score <- 1 # here when the two clusters are equal, its good 
      } else {
        score <- -1
      }

    # One is negative control, one is target
    } else if ((p1 %in% negative_controls && p2 %in% target_proteins) || 
               (p2 %in% negative_controls && p1 %in% target_proteins)) {
      if (c1 == 0 || c2 == 0) {
        score <- 0 # noise is always 0
      } else if (c1 == c2) {
        score <- -1 # here when the two clusters are equal its bad 
      } else {
        score <- 1  # Properly separated
      }

    # Both are negative controls – don't score this pair
    } else {
      next # skips scoring booth negatives 
    }

    pair_scores <- c(pair_scores, score) # apart from neg neg pairs all resulting scores are listed  
  }

  avg_score <- mean(pair_scores, na.rm = TRUE) # computes the avaredge 

  parts <- strcapture("eps_([0-9.]+)_minPts_([0-9]+)", col_name, data.frame(eps = 0, minPts = 0))# extracts from colnames the values for eps and minPts as numbers 
  
  score_results <- rbind(score_results, data.frame( # three columns 
    eps = as.numeric(parts$eps),
    minPts = as.numeric(parts$minPts),
    score = avg_score
  ))
}

# --- Step 6: Plot heatmap ---
ggplot(score_results, aes(x = eps, y = minPts, fill = score)) +
  geom_tile(color = "grey70") +
  scale_fill_gradient2(low = "red", mid = "white", high = "blue", midpoint = 0, limits = c(-1, 1)) +
  labs(title = "Clustering Consistency of Protein Pairs",
       subtitle = "Based on DBSCAN parameter combinations",
       x = "Epsilon (ε)",
       y = "MinPts",
       fill = "Avg. Pair Score") +
  theme_minimal()